##########################################################################################################################################################
### Setup

### Set up environment

# Clear workspace
# Removes every object that ls() lists in the environment where this line runs
# (the global environment when the script is executed at top level).
# NOTE(review): this does not detach packages or reset options, and it deletes
# the caller's objects if the script is source()d into a live session; running
# the script in a fresh R session would make this line unnecessary.
rm(list=ls())

# Load libraries
library(tidyverse)
library(magrittr)
library(abmisc)
library(igraph)
library(showtext)
library(rcartocolor)
library(grDevices)



##########################################################################################################################################################
### Make Figure 1


### Format the data

# Define associations: one row per arrow in the flowchart, written as
# (source node, target node) pairs. "\n" inside a name is a line break in the
# label that is later drawn for that node.
flow <- tribble(
  ~from,                      ~to,
  "Hand-Collected",           "Bill-Level Data",
  "Shor/McCarty",             "Bill-Level Data",
  "LegiScan Bills",           "Committee\nSponsor Check",
  "LegiScan Bills",           "Bill Action Regex",
  "LegiScan Bills",           "Missing Vote\nCheck",
  "Missing Vote\nCheck",      "Bill-Level Data",
  "Bill Action Regex",        "Bill-Level Data",
  "Committee\nSponsor Check", "Bill-Level Data",
  "LegiScan Votes",           "Aggregate",
  "Aggregate",                "Bill-Level Data",
  "Bill-Level Data",          "North Carolina\nData"
)

# Make into a graph, pull out coordinates in tree form
flow_graph <- graph_from_data_frame(flow, directed = TRUE)
coords <- layout_as_tree(flow_graph)
colnames(coords) <- c("x", "y")

# Node categories, keyed by node name. Looking the category up by name (rather
# than relying on the order in which igraph happens to number the vertices)
# keeps every node correctly classified if rows of `flow` are reordered.
node_types <- c(
  "Hand-Collected"           = "Source",
  "Shor/McCarty"             = "Source",
  "LegiScan Bills"           = "Source",
  "Missing Vote\nCheck"      = "Action",
  "Bill Action Regex"        = "Action",
  "Committee\nSponsor Check" = "Action",
  "LegiScan Votes"           = "Source",
  "Aggregate"                = "Action",
  "Bill-Level Data"          = "Final Data",
  "North Carolina\nData"     = "Final Data"
)
node_names <- vertex_attr(flow_graph, "name")

# Fail loudly if a node was added to `flow` without being given a category
stopifnot(all(node_names %in% names(node_types)))

# Get an output dataframe: one row per node with its centre (x, y), name, type
output_df <- as_tibble(coords) %>%
  mutate(
    step = node_names,
    x = x * -1, # Mirror the layout horizontally
    type = factor(unname(node_types[step]),
                  levels = c("Source", "Action", "Final Data")),
    y = ifelse(step == "Bill-Level Data", 0, y) # Put bill-level data and NC data on same row
  ) %>%
  # Hand-placed horizontal positions for the two final-data boxes
  mutate(x = case_when(
    step == "North Carolina\nData" ~ 0.25,
    step == "Bill-Level Data" ~ 2.25,
    TRUE ~ x
  ))

# Make boxes around the nodes: every box is centred on its node and extends
# 0.4 units to each side and 0.3 units above and below
box_half_width <- 0.4
box_half_height <- 0.3
plot_nodes <- mutate(
  output_df,
  xmin = x - box_half_width,
  xmax = x + box_half_width,
  ymin = y - box_half_height,
  ymax = y + box_half_height
)

# Add variable names to plot_nodes
# Each entry is the bulleted list of variables printed inside a node's box
# ("" for boxes with no list). Entries are keyed by the node's `step` name and
# attached by lookup, so a label cannot end up in the wrong box if the row
# order of plot_nodes changes.
var_names <- c(
  "Hand-Collected"           = "- Veto Thresh.\n- Line-Item Veto\n- Seat Ratio\n- Term Diff.\n- Div. Govt.\n- Maj. Share",
  "Shor/McCarty"             = "- Party Diff.",
  "LegiScan Bills"           = "- Year\n- Chamber\n- # Cosp.\n- Maj. Spon.",
  "Missing Vote\nCheck"      = "- High-Missingness",
  "Bill Action Regex"        = "- Reached Vote\n- Became Law",
  "Committee\nSponsor Check" = "- Committee Spon.",
  "LegiScan Votes"           = "",
  "Aggregate"                = "- Unity Vote\n- Prop. Yeas\n- Prop. Min. Yeas",
  "Bill-Level Data"          = "",
  "North Carolina\nData"     = ""
)

# Fail loudly if a node has no entry instead of silently drawing NA
stopifnot(all(plot_nodes$step %in% names(var_names)))
plot_nodes$vars <- unname(var_names[plot_nodes$step])

# Make the edges: two rows per arrow (s_e = "from" for the tail, "to" for the
# head), each carrying the coordinates where that end of the arrow is drawn.
plot_edges <- flow %>%
  mutate(id = row_number()) %>%
  pivot_longer(cols = c("from", "to"),
               names_to = "s_e",
               values_to = "step") %>%
  left_join(plot_nodes, by = "step") %>%
  mutate(
    # By default an arrow leaves the bottom of its source box and enters the
    # top of its target box. The Bill-Level -> North Carolina arrow is the
    # exception: it runs at mid-height (y = 0) between the two boxes.
    y = case_when(
      s_e == "from" & step == "Bill-Level Data" ~ 0,
      s_e == "to" & step == "North Carolina\nData" ~ 0,
      s_e == "from" ~ ymin,
      TRUE ~ ymax
    ),
    # That same arrow starts at the left edge of the Bill-Level box (1.85) and
    # ends at the right edge of the North Carolina box (0.65); every other
    # arrow end sits at its node's centre x
    x = case_when(
      s_e == "from" & step == "Bill-Level Data" ~ 1.85,
      s_e == "to" & step == "North Carolina\nData" ~ 0.65,
      TRUE ~ x
    )
  ) %>%
  dplyr::select(id, s_e, step, x, y)


### Add NC data work "manually"

# Node info: two extra source boxes positioned by hand (not by the tree
# layout), using the same box half-sizes as the other nodes (0.4 wide, 0.3 tall)
nc_dat_nodes <- data.frame(
  x = c(-1.75, -1.75),
  y = c(-0.45, 0.25),
  step = c("NC Journals\n(Hand-Collected)", "Open States"),
  type = c("Source", "Source")
)
nc_dat_nodes <- nc_dat_nodes %>%
  mutate(
    xmin = x - 0.4,
    xmax = x + 0.4,
    ymin = y - 0.3,
    ymax = y + 0.3,
    vars = c("- Unity Vote\n- Prop. Yeas\n- Prop. Min. Yeas", "- Reached Vote\n- Became Law\n")
  )

# Append the hand-placed boxes to the layout-derived ones
plot_nodes <- rbind(plot_nodes, nc_dat_nodes)

# Edge info: arrows 12 and 13 run from the right edge of each hand-placed
# source box (x = -1.35) to the left edge of the North Carolina box
# (x = -0.15), both ending at mid-height (y = 0)
nc_edge_info <- data.frame(
  id = rep(c(12, 13), each = 2),
  s_e = c("from", "to", "from", "to"),
  step = c(
    "NC Journals\n(Hand-Collected)", "North Carolina\nData",
    "Open States", "North Carolina\nData"
  ),
  x = c(-1.35, -0.15, -1.35, -0.15),
  y = c(-0.45, 0, 0.25, 0)
)

# Append to the edges derived from `flow`
plot_edges <- rbind(plot_edges, nc_edge_info)



### Plot

# Assemble the flowchart in a single chain; layers are drawn in the order listed
p <- ggplot() +
  # Node boxes, with fill and outline keyed to the node type
  geom_rect(
    data = plot_nodes,
    mapping = aes(
      xmin = xmin, ymin = ymin,
      xmax = xmax, ymax = ymax,
      fill = type, colour = type
    ),
    alpha = 0.5,
    linewidth = 1.5
  ) +
  # Bold node titles, placed near the top of each box
  geom_text(
    data = plot_nodes,
    mapping = aes(x = x, y = y + 0.225, label = step),
    fontface = 'bold'
  ) +
  # Variable lists, placed just below the centre of each box
  geom_text(
    data = plot_nodes,
    mapping = aes(x = x, y = y - 0.035, label = vars)
  ) +
  # Arrows: one path per edge id, ending in a closed arrowhead
  geom_path(
    data = plot_edges,
    mapping = aes(x = x, y = y, group = id),
    arrow = arrow(length = unit(0.3, "cm"), type = "closed")
  ) +
  # Prettify: blank canvas, greyscale fills, one outline colour, untitled legend
  theme_void() +
  scale_fill_manual(values = c("#525252", "#bdbdbd", "#f0f0f0")) +
  scale_color_manual(values = rep("#525252", 3)) +
  theme(legend.text = element_text(size = 15)) +
  guides(color = guide_legend(title = ''), fill = guide_legend(title = ''))


### Save
# Write the figure as a 12 x 10 inch PDF. The output folder is created first
# if it is missing (e.g. on a fresh checkout), because the save fails when the
# target directory does not exist; dir.create() is a no-op if it already does.
fig_path <- "results/fig1_data_flowchart.pdf"
dir.create(dirname(fig_path), showWarnings = FALSE, recursive = TRUE)
ggsave(filename = fig_path, plot = p, width = 12, height = 10)
